This notebook explores the possibility of selecting the right Rustc compilation flags in order to optimize for build time, run time, and binary size. This is a particularly important issue in the Rust community, as Rust compilation times are notoriously long. Specifically, a careful selection of compilation flags can meet certain run-time or binary-size requirements while minimizing compile time, which is ideal for a tight development loop. The following are the options that this demo is able to optimize in order to discover the Pareto frontier:
%%HTML
<script src="require.js"></script>
import os
import time
import subprocess
import sys
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook' # allows for rendering outside of jupyter
import pickle
import numpy as np
import pandas as pd
from scipy.stats import t
from mlos.Optimizers.BayesianOptimizerConfigStore import bayesian_optimizer_config_store
from mlos.Optimizers.BayesianOptimizerFactory import BayesianOptimizerFactory
from mlos.Spaces import Point
from mlos.Optimizers.OptimizationProblem import OptimizationProblem, Objective
from mlos.Optimizers.BayesianOptimizer import BayesianOptimizer
from mlos.Spaces import SimpleHypergrid, ContinuousDimension, CategoricalDimension
from bokeh.io import output_notebook, show
from mlos.OptimizerMonitoring.Tomograph.ModelTomograph2 import ModelTomograph2
Here is where I set up the optimizer and give it the parameters that it has to play with and the goals that it is optimizing for. Note that the options are passed in as integers to be converted later; this is due to certain formatting issues and pluggability requirements.
# Search space: every rustc flag is encoded as a categorical integer, where -1
# always means "leave this flag unset" and the remaining integers are decoded
# into concrete flag values later (see get_param_value).
input_space = SimpleHypergrid(
    name="input",
    dimensions=[
        CategoricalDimension(name="embed-bitcode", values=[-1, 0, 1]),
        CategoricalDimension(name="opt-level", values=[-1, 0, 1, 2, 3, 4, 5]),
        CategoricalDimension(name="force-frame-pointers", values=[-1, 0, 1]),
        CategoricalDimension(name="force-unwind-tables", values=[-1, 0, 1]),
        CategoricalDimension(name="linker-flavor", values=[-1, 0, 1]),
        CategoricalDimension(name="lto", values=[-1, 0, 1, 2]),
        CategoricalDimension(name="codegen-units", values=[-1, 0, 1, 2, 3, 4, 5, 6]),
        CategoricalDimension(name="no-vectorize-loops", values=[-1, 0]),
        CategoricalDimension(name="no-vectorize-slp", values=[-1, 0]),
        CategoricalDimension(name="overflow-checks", values=[-1, 0, 1]),
        CategoricalDimension(name="target-cpu", values=[-1, 0]),
        CategoricalDimension(name="relocation-model", values=[-1, 0, 1]),
        CategoricalDimension(name="panic", values=[-1, 0, 1]),
    ],
)

# Objective space: all three metrics are measured per build.
output_space = SimpleHypergrid(
    name="obj",
    dimensions=[
        ContinuousDimension(name="compile-time-s", min=0, max=1000),
        ContinuousDimension(name="run-time-s", min=0, max=1000),
        ContinuousDimension(name="binary-size-mb", min=0, max=1000),
    ],
)

# Every objective is minimized: faster builds, faster binaries, smaller binaries.
optimization_problem = OptimizationProblem(
    parameter_space=input_space,
    objective_space=output_space,
    objectives=[
        Objective(name="compile-time-s", minimize=True),
        Objective(name="run-time-s", minimize=True),
        Objective(name="binary-size-mb", minimize=True),
    ],
)
# Run a specified sh command in the directory specified by base_dir.
# Returns: a list of size two containing the following entries: standard out
# of the process, and total execution time in seconds. If stderr indicates a
# failed build ("failed" or "could not compile"), the elapsed time is replaced
# by the global FAILED_PENALTY_COMPILE_TIME_S so the optimizer learns to avoid
# that configuration.
def run_cmd(cmd, base_dir="."):
    """Execute *cmd* in *base_dir*, timing it and capturing its output.

    Note: the command string is split on single spaces, so arguments
    containing spaces are not supported.
    """
    print("Running: " + cmd)
    start_time = time.perf_counter()
    # An argv list with shell=False (the default) avoids shell-injection issues.
    result = subprocess.run(cmd.split(" "), stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, cwd=base_dir)
    end_time = time.perf_counter()
    out = result.stdout.decode("utf-8")
    err = result.stderr.decode("utf-8")
    if "failed" in err or "could not compile" in err:
        # Compilation failed: surface the error and report the penalty time.
        print(err)
        return [out, FAILED_PENALTY_COMPILE_TIME_S]
    return [out, end_time - start_time]
# Runs the following four operations:
# 1) Delete the generated target directory to ensure a clean compile
# 2) Run "cargo rustc" (build)
# 3) Run "cargo run" (run)
# 4) Determine the size of the compiled binary in megabytes
# Returns: [compile time in seconds, run time in seconds, binary size in megabytes]
def run_compile_test(base_dir="bevybench"):
    """Clean-build, run, and size the cargo project in *base_dir*.

    The compiled binary is assumed to live at target/debug/<basename of
    base_dir> inside the project (cargo's default for a binary crate whose
    name matches its directory -- TODO confirm for other projects).

    On a failed compile (detected by run_cmd) the global FAILED_PENALTY_*
    values are returned instead of real measurements.
    """
    run_cmd("cargo clean", base_dir=base_dir)
    compile_time = run_cmd("cargo rustc", base_dir)[1]
    if compile_time == FAILED_PENALTY_COMPILE_TIME_S:
        # Build failed: skip running/measuring and report the penalty values.
        run_time = FAILED_PENALTY_RUN_TIME_S
        binary_size_mb = FAILED_PENALTY_BINARY_SIZE_MB
    else:
        run_time = run_cmd("cargo run", base_dir)[1]
        # "du -B 2**20" reports the size in whole MiB blocks; the size is the
        # first tab-separated field of the output.
        binary_name = os.path.basename(base_dir)
        du_result = run_cmd("du -B %s target/debug/%s" % (2**20, binary_name), base_dir)[0]
        binary_size_mb = int(du_result.split("\t")[0])
    print([compile_time, run_time, binary_size_mb])
    return [compile_time, run_time, binary_size_mb]
# List of parameters and their possible values (the categorical integers in the
# input space index into these options; -1 means "leave the flag unset"):
# code-model (tiny,small,kernel,medium,large)
# codegen-units (0-512)
# embed-bitcode (yes,no,default)
# force-frame-pointers (yes,no,default)
# force-unwind-tables (yes,no,default)
# linker-flavor (gcc,ld,unspecified)
# lto (yes,no,thin,default)
# no-vectorize-loops (present/notpresent)
# no-vectorize-slp (present/notpresent)
# opt-level (0,1,2,3,s,z)
# overflow-checks (yes,no,default)
# target-cpu (enabled,disabled) (value=native)
# relocation-model (static,pic,default)
# panic (abort,unwind,default)
# NOTE(review): "code-model" is not a dimension of the input space above, so
# code_model_str appears unused -- confirm before removing.
code_model_str=["small","kernel","medium","large"]
# Integer code i maps to codegen_units_str[i] / opt_level_str[i] in get_param_value.
codegen_units_str=["1","2","8","16","64","256","512"]
opt_level_str=["0","1","2","3","s","z"]
# Given a python dictionary of the form {"param":val} (ex: {"target-cpu":0}) it
# will return the appropriate string to put into the RUSTFLAGS environment variable
# Ex: "-C target-cpu=native -C no-vectorize-loops"
# Returns: String for RUSTFLAGS given dict from optimizer
def gen_param_string(frame):
    """Build the RUSTFLAGS string for one parameter assignment.

    Accepts either a plain dict of {param-name: encoded-int} or an object
    whose iteration yields (name, value) tuples while supporting lookup by
    name (presumably how the optimizer's suggestion behaves -- TODO confirm).
    An encoded value of -1 means "leave the flag unset" and is skipped.
    """
    params = ""
    for key in frame:
        # Some frame types iterate as (name, value) tuples; keep only the name.
        # (The original unconditionally did key = key[0], which broke plain
        # dicts by reducing each key to its first character.)
        if isinstance(key, tuple):
            key = key[0]
        if not frame[key] == -1:
            params += "-C %s%s " % (key, get_param_value(key, frame[key]))
    return params
# Subroutine for gen_param_string(..): maps a parameter's encoded integer value
# to the "=value" suffix appended after "-C <name>" for that specific key.
# Returns: String ("" for presence-only flags, "Invalid" for unknown keys)
def get_param_value(k, v):
    """Decode parameter *k*'s integer code *v* into its rustc flag suffix."""
    # Flags whose value is an index into a module-level lookup table.
    if k == "codegen-units":
        return "=" + codegen_units_str[v]
    if k == "opt-level":
        return "=" + opt_level_str[v]
    # Simple yes/no flags: 0 -> yes, anything else -> no.
    if k in ("embed-bitcode", "force-frame-pointers", "force-unwind-tables", "overflow-checks"):
        return "=yes" if v == 0 else "=no"
    if k == "linker-flavor":
        return "=gcc" if v == 0 else "=ld"
    if k == "lto":
        lto_values = {0: "=yes", 1: "=no", 2: "=thin"}
        # An unknown lto code falls through to the "Invalid" path below,
        # matching the original behavior.
        if v in lto_values:
            return lto_values[v]
    if k == "target-cpu":
        return "=native"
    # Presence-only flags: the flag name alone is the whole option.
    if k in ("no-vectorize-loops", "no-vectorize-slp"):
        return ""
    if k == "relocation-model":
        return "=static" if v == 0 else "=pic"
    if k == "panic":
        return "=abort" if v == 0 else "=unwind"
    print("Invalid parameter found in get_param_value: " + k)
    return "Invalid"
This step sets up penalty values that are critical for the optimizer. Some of the configurations will result in a failed compile (specifically those involving the link-time-optimization compile option), so the model needs to learn not to keep exploring those options. Multiplying each baseline measurement by a large factor (10 in this case) ensures that these penalty points will almost certainly be worse than anything the optimizer would encounter with a successful compile.
# Set up penalties for a failed compile. Deriving them from a baseline build
# keeps this flexible across a number of different benchmarks.
# Clear any flags so the baseline build uses rustc defaults.
os.environ["RUSTFLAGS"] = ""
# The penalty constants must exist before run_compile_test() is called;
# start them at zero for the baseline measurement.
FAILED_PENALTY_COMPILE_TIME_S = 0
FAILED_PENALTY_RUN_TIME_S = 0
FAILED_PENALTY_BINARY_SIZE_MB = 0
# Measure the baseline, then set each penalty to 10x the corresponding
# baseline value so failed compiles score worse than any successful config.
base_compile_s, base_run_s, base_size_mb = run_compile_test()
FAILED_PENALTY_COMPILE_TIME_S = base_compile_s * 10
FAILED_PENALTY_RUN_TIME_S = base_run_s * 10
FAILED_PENALTY_BINARY_SIZE_MB = base_size_mb * 10
Running: cargo clean Running: cargo rustc Running: cargo run Running: du -B 1048576 target/debug/bevybench [41.4023454580456, 27.29031689907424, 165]
# Setup optimizer configs
# Start from the default Bayesian optimizer config and tune it for this workload.
optimizer_config = bayesian_optimizer_config_store.default
# Fraction of suggestions drawn at random (0.1) to keep exploring the space.
optimizer_config.experiment_designer_config.fraction_random_suggestions = .1
random_forest_config = optimizer_config.homogeneous_random_forest_regression_model_config
# Refit the decision trees after every single new sample.
random_forest_config.decision_tree_regression_model_config.n_new_samples_before_refit = 1
random_forest_config.decision_tree_regression_model_config.splitter = 'best'
# Each of the 20 estimators trains on 80% of the samples.
random_forest_config.samples_fraction_per_estimator = .8
random_forest_config.n_estimators = 20
# Alpha for the confidence-bound utility function used by the experiment designer.
optimizer_config.experiment_designer_config.confidence_bound_utility_function_config.alpha = 0.1
# Build a local optimizer for the problem defined above.
optimizer_factory = BayesianOptimizerFactory()
optimizer = optimizer_factory.create_local_optimizer(
optimization_problem=optimization_problem,
optimizer_config=optimizer_config
)
# Save those penalties as attributes of the optimizer for storage purposes
# (they are read back after unpickling the optimizer later in the notebook).
optimizer.FAILED_PENALTY_COMPILE_TIME_S=FAILED_PENALTY_COMPILE_TIME_S
optimizer.FAILED_PENALTY_RUN_TIME_S=FAILED_PENALTY_RUN_TIME_S
optimizer.FAILED_PENALTY_BINARY_SIZE_MB=FAILED_PENALTY_BINARY_SIZE_MB
Perform 1000 iterations of getting suggestions, evaluating them to see performance, and then getting new suggestions. I save the optimizer every 100 iterations. Note: many of these iterations may be very fast due to selecting incompatible compilation flags.
# Perform one optimization iteration: ask the optimizer for a configuration,
# build/run/measure it, and feed the result back.
# Returns: nothing
def perform_run():
    # Ask the optimizer for the next configuration to try.
    suggested_value = optimizer.suggest()
    input_values_df = suggested_value.to_dataframe()
    # Translate the suggestion into RUSTFLAGS and export it for cargo.
    param_string = gen_param_string(suggested_value)
    print(param_string)
    os.environ["RUSTFLAGS"] = param_string
    # Build, run, and measure the binary under these flags.
    target_value = run_compile_test()
    print(suggested_value.to_json(), target_value)
    compile_s, run_s, size_mb = target_value
    target_values_df = pd.DataFrame(
        {"compile-time-s": [compile_s],
         "run-time-s": [run_s],
         "binary-size-mb": [size_mb]})
    # Record the observation with the optimizer.
    optimizer.register(input_values_df, target_values_df)
# Main optimization loop: run num_iter iterations, checkpointing the pickled
# optimizer every 100 iterations and once more at the end.
num_iter = 1000
for x in range(num_iter):
    perform_run()
    # Save progress every 100 iterations.
    if ((x + 1) % 100) == 0:
        # 'with' closes the checkpoint file even if pickling raises
        # (the original left the handle open).
        with open("optimizer_" + str(x) + ".obj", 'wb') as filehandle:
            pickle.dump(optimizer, filehandle, pickle.HIGHEST_PROTOCOL)
# Save the optimizer when done.
with open("optimizer_done_1000.obj", 'wb') as filehandle:
    pickle.dump(optimizer, filehandle, pickle.HIGHEST_PROTOCOL)
(Make sure that the optimizer has been written to a file, as this step reloads the optimizer from that file.)
# Reload the optimizer and its penalty parameters from the checkpoint.
# 'with' closes the file handle (the original leaked it).
with open("optimizer_done_1000.obj", 'rb') as filehandle:
    optimizer = pickle.load(filehandle)
FAILED_PENALTY_COMPILE_TIME_S = optimizer.FAILED_PENALTY_COMPILE_TIME_S
FAILED_PENALTY_RUN_TIME_S = optimizer.FAILED_PENALTY_RUN_TIME_S
FAILED_PENALTY_BINARY_SIZE_MB = optimizer.FAILED_PENALTY_BINARY_SIZE_MB
# Load all the observations from the optimizer once and reuse the result
# (the original called get_all_observations() a second time for the concat).
observations = optimizer.get_all_observations()
observations_df = pd.concat(observations, axis=1)
# Indicate the points that are along the pareto frontier.
observations_df['is_pareto'] = False
observations_df.loc[optimizer.pareto_frontier.pareto_df.index, 'is_pareto'] = True
# Drop failed-compile observations (they carry the penalty compile time).
observations_df = observations_df[observations_df['compile-time-s'] < FAILED_PENALTY_COMPILE_TIME_S]
observations_df['class'] = "non-pareto"
observations_df.loc[observations_df['is_pareto'], 'class'] = "pareto"
# This should be filled in manually. If you wish to add additional points to compare against, put them in here
# The first is the result of "time cargo build; time cargo run"
# The second is the result of "time cargo build --release; time cargo run --release"
benchmark_observations_df = pd.DataFrame({'compile-time-s': [41.39, 70.05],
                                          'run-time-s': [26.82, 4.35],
                                          'binary-size-mb': [165, 16],
                                          'is_pareto': [True, True],
                                          'class': ['cargo debug', 'cargo release']})
# DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the
# drop-in replacement for appending rows.
observations_df = pd.concat([observations_df, benchmark_observations_df])
observations_df["class"] = observations_df["class"].astype(str)
These are the Pareto-optimal trade-offs between compile time, run time, and binary size.
# Display the pareto frontier table computed by the optimizer.
print("Pareto optimal times:")
optimizer.pareto_frontier.pareto_df
Pareto optimal times:
| compile-time-s | run-time-s | binary-size-mb | |
|---|---|---|---|
| 131 | 37.785157 | 25.180844 | 144 |
| 788 | 37.946447 | 25.667808 | 144 |
| 118 | 37.956711 | 28.385938 | 144 |
| 744 | 38.083198 | 25.476631 | 144 |
| 95 | 38.118935 | 28.177435 | 144 |
| 265 | 38.152862 | 25.39087 | 144 |
| 815 | 38.331182 | 25.597167 | 144 |
| 85 | 38.339696 | 25.174156 | 157 |
| 122 | 38.371449 | 25.734871 | 144 |
| 138 | 38.435726 | 24.991312 | 160 |
| 981 | 38.516636 | 25.170927 | 157 |
| 368 | 38.583499 | 25.454201 | 144 |
| 958 | 38.623887 | 25.062429 | 157 |
| 182 | 38.980438 | 27.764803 | 144 |
| 141 | 39.061868 | 28.21017 | 144 |
| 871 | 39.1048 | 25.54276 | 126 |
| 242 | 39.263489 | 25.673851 | 126 |
| 247 | 39.321013 | 25.866137 | 126 |
| 82 | 39.425668 | 25.854494 | 126 |
| 912 | 39.596829 | 25.464819 | 126 |
| 830 | 39.69728 | 25.641174 | 126 |
| 781 | 39.701413 | 25.638847 | 126 |
| 580 | 39.798785 | 25.607595 | 126 |
| 706 | 39.807819 | 25.679126 | 126 |
| 846 | 39.808955 | 26.016054 | 125 |
| 279 | 39.829565 | 25.7581 | 126 |
| 152 | 39.863884 | 26.105375 | 125 |
| 83 | 39.867511 | 25.672488 | 126 |
| 50 | 39.96381 | 25.876501 | 126 |
| 782 | 40.084136 | 25.740959 | 126 |
| 849 | 40.428845 | 26.199866 | 125 |
| 847 | 40.459182 | 25.571051 | 126 |
| 795 | 41.143566 | 25.58113 | 126 |
| 350 | 41.170804 | 24.937431 | 218 |
| 923 | 41.384206 | 24.923701 | 210 |
| 732 | 42.485402 | 25.418109 | 120 |
| 125 | 42.636635 | 25.967999 | 119 |
| 256 | 42.680373 | 25.914792 | 120 |
| 236 | 42.940996 | 25.66679 | 120 |
| 935 | 43.156986 | 24.97653 | 210 |
| 742 | 43.190142 | 25.651082 | 120 |
| 41 | 43.375846 | 25.799742 | 120 |
| 723 | 44.039039 | 25.60308 | 119 |
| 898 | 55.569796 | 4.352302 | 139 |
| 976 | 56.194269 | 4.421138 | 139 |
| 886 | 57.895745 | 4.333575 | 167 |
| 950 | 57.957251 | 4.346791 | 164 |
| 158 | 67.498019 | 4.347559 | 138 |
| 313 | 67.749282 | 4.496721 | 138 |
| 161 | 75.005001 | 4.341556 | 104 |
| 8 | 75.870305 | 4.330907 | 168 |
| 917 | 76.828226 | 4.317957 | 180 |
# Data to plot; swap in a filtered frame to focus on a subset,
# e.g. observations_df[observations_df["opt-level"]==4]
plot_data_df=observations_df
def show_plotly_xy_scatter_pareto_colored(plot_data_df, title=""):
    """Show a 2D and a 3D scatter of the observations, colored by 'class'.

    The 2D plot omits binary size, as that is generally not a super important
    factor; the 3D plot adds it back as the z axis. Hover shows every column.
    """
    # When the dataframe is passed as the first argument, plotly express
    # accepts plain column names for x/y/z/color -- simpler and equivalent to
    # the original's redundant Series arguments.
    fig = px.scatter(
        plot_data_df,
        x='compile-time-s',
        y='run-time-s',
        title=title,
        hover_data=list(plot_data_df.columns),
        color='class'
    )
    fig.show()
    # The 3D plot with binary size as the third dimension.
    fig = px.scatter_3d(
        plot_data_df,
        x='compile-time-s',
        y='run-time-s',
        z='binary-size-mb',
        title=title,
        hover_data=list(plot_data_df.columns),
        color='class'
    )
    fig.show()
# Plot every observation first, then one plot per opt-level setting.
show_plotly_xy_scatter_pareto_colored(observations_df, title="All observations")
for level_index, level_name in enumerate(opt_level_str):
    subset = observations_df[observations_df['opt-level'] == level_index]
    show_plotly_xy_scatter_pareto_colored(subset, title="Observations where opt-level=%s" % (level_name))
This visualization is designed to help explain the distance to the pareto frontier given certain individual parameters. The lto flag has particularly interesting interactions with opt-level and codegen-units. (The lighter the value, the worse the performance)
# NOTE(review): this imports ModelTomograph, while the top of the notebook
# imports ModelTomograph2 -- confirm which one is intended.
from mlos.OptimizerMonitoring.Tomograph.ModelTomograph import ModelTomograph
# Route bokeh output into the notebook before plotting.
output_notebook()
tomograph = ModelTomograph(optimizer=optimizer)
tomograph.plot()
This has proven to be a very fruitful exercise as the graph of "All Observations" demonstrates that while the debug build might not be worth changing at this point in time, if you want release style performance with a lower compile time, the correct set of flags was able to save 27% on compile time while keeping equivalent run time performance. In this case, this performance was also optimal across two machines of different hardware specifications. However this may not always be the case.
If you have time, replace my bevybench rust program with your own such that when you run "cargo run", it runs a benchmark of your program. Then, let the optimizer go to town overnight and see how much time can be saved on your compilation via ideal selection of flags for your machine and for your code.